import time
time_start_notebook = time.time()
%%capture
import sys
ENV_COLAB = 'google.colab' in sys.modules
if ENV_COLAB:
# usual imports
!pip install watermark
!pip install scikit-plot
!pip install --upgrade git+https://github.com/stanfordmlgroup/ngboost.git
# HPO
!git clone https://github.com/thuijskens/scikit-hyperband.git
sys.path.append('scikit-hyperband/hyperband')
print('Environment: Google Colab')
sys.path.append("/Users/poudel/Dropbox/a00_Resources/hyperband")
try:
from search import HyperbandSearchCV
print('File found: search.py')
except:
print('File not found: search.py')
try:
from hyperband_search import HyperbandSearchCV
print('File found: hyperband_search.py')
except:
print('File not found: hyperband_search.py')
File not found: search.py File found: hyperband_search.py
import numpy as np
import pandas as pd
import seaborn as sns
import os,sys,time
import matplotlib.pyplot as plt
import joblib
from tqdm import tqdm_notebook as tqdm
import plotly_express as px
# modelling
from sklearn.preprocessing import OneHotEncoder
import imblearn
from imblearn.over_sampling import SMOTE
import sklearn.metrics as skmetrics
# pipeline
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.preprocessing import FunctionTransformer
# boosting
import ngboost as ngb
# settings
sns.set()
SEED = 100
pd.set_option('max_columns',100)
pd.set_option('max_colwidth',200)
pd.set_option('plotting.backend','matplotlib') # matplotlib, bokeh, altair, plotly
%matplotlib inline
%load_ext watermark
%watermark -iv
json 2.0.9 numpy 1.19.4 plotly_express 0.4.1 seaborn 0.11.0 pandas 1.1.4 joblib 0.17.0 imblearn 0.7.0 autopep8 1.5.2
def show_methods(obj, ncols=4,contains=None):
lst = [i for i in dir(obj) if i[0]!='_' ]
if contains is not None:
lst = [i for i in lst if contains in i]
df = pd.DataFrame(np.array_split(lst,ncols)).T.fillna('')
return df
def model_eval_bin(model_name,ytest,ypreds,yprobs2d,show_plots=True):
import sklearn.metrics as skmetrics
import scikitplot.metrics as skpmetrics
import os
acc = skmetrics.accuracy_score(ytest,ypreds)
precision = skmetrics.precision_score(ytest,ypreds)
recall = skmetrics.recall_score(ytest,ypreds)
f1 = skmetrics.f1_score(ytest,ypreds)
auc = skmetrics.roc_auc_score(ytest,ypreds)
print(skmetrics.classification_report(ytest,ypreds))
print(skmetrics.confusion_matrix(ytest,ypreds))
df_res = pd.DataFrame({'Accuracy':[acc],
'Precision': [precision],
'Recall': [recall],
'F1-score': [f1],
'AUC': [auc]},index=[model_name])
display(df_res.style.format("{:.4f}"))
if not os.path.isdir('../outputs'):
os.makedirs('../outputs')
o = '.' if ENV_COLAB else '../outputs/'
df_res.to_csv(o+f'model_{model_name}.csv',index=True)
if show_plots:
skpmetrics.plot_precision_recall(ytest,yprobs2d) # more focus on minority
skpmetrics.plot_roc_curve(ytest,yprobs2d) # equal focus on both groups
skpmetrics.plot_confusion_matrix(ytest,ypreds)
def get_profit(y_true, y_pred):
tn, fp, fn, tp = skmetrics.confusion_matrix(y_true,y_pred).ravel()
profit = 400*tp - 200*fn - 100*fp
return profit
scoring = skmetrics.make_scorer(get_profit, greater_is_better=True)
path_data_train = '../data/raw/train.csv'
path_data_test = '../data/raw/test.csv'
if ENV_COLAB:
path_data_train = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/train.csv'
path_data_test = 'https://raw.githubusercontent.com/bhishanpdl/Datasets/master/Projects/Telco_Customer_Churn/raw/test.csv'
df_train = pd.read_csv(path_data_train)
df_test = pd.read_csv(path_data_test)
print(df_train.shape)
print(df_test.shape)
df_train.head(2).append(df_train.tail(2))
(5634, 21) (1409, 21)
| customerID | gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Churn | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1621-YNCJH | Female | 0 | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.4 | No |
| 1 | 7143-BQIBA | Male | 0 | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No |
| 5632 | 0862-PRCBS | Female | 0 | Yes | Yes | 68 | Yes | Yes | Fiber optic | No | Yes | No | Yes | Yes | Yes | Two year | Yes | Credit card (automatic) | 103.75 | 7039.45 | No |
| 5633 | 4656-CAURT | Male | 0 | No | No | 69 | Yes | Yes | No | No internet service | No internet service | No internet service | No internet service | No internet service | No internet service | Two year | No | Bank transfer (automatic) | 23.95 | 1713.1 | No |
target_name = 'Churn'
px.histogram(df_train, x=target_name,height=300,width=300)
px.histogram(df_train, x='gender', color=target_name,height=300,width=300)
df_train['TotalCharges'] = pd.to_numeric(df_train['TotalCharges'],errors='coerce').fillna(0)
df_test['TotalCharges'] = pd.to_numeric(df_test['TotalCharges'],errors='coerce').fillna(0)
df_train['SeniorCitizen'] = df_train['SeniorCitizen'].map({0:'No',1:'Yes'})
df_test['SeniorCitizen'] = df_test['SeniorCitizen'].map({0:'No',1:'Yes'})
df_Xtrain = df_train.drop(target_name,axis=1)
df_Xtest = df_test.drop(target_name,axis=1)
ser_ytrain = df_train[target_name].map({'No':0,'Yes':1})
ser_ytest = df_test[target_name].map({'No':0,'Yes':1})
ytrain = np.array(ser_ytrain).flatten()
ytest = np.array(ser_ytest).flatten()
index_name = 'customerID'
ser_train_ids = df_Xtrain.pop(index_name)
ser_test_ids = df_Xtest.pop(index_name)
df_Xtrain.head(2)
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | No | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 |
| 1 | Male | No | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 |
cols_num = list(df_train.select_dtypes('number').columns)
cols_num
['tenure', 'MonthlyCharges', 'TotalCharges']
cols_cat = list(df_train.select_dtypes('object').columns)
# gender is no good predictor as seen in EDA
cols_exclude = ['customerID','gender','TotalCharges'] + [target_name]
cols_cat = [ i for i in cols_cat if i not in cols_exclude ] + ['SeniorCitizen']
print(cols_cat)
['SeniorCitizen', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'SeniorCitizen']
cols_num = ['TotalCharges','tenure', 'MonthlyCharges']
cols_num_old = cols_num
cols_cat_old = cols_cat
def combine_two_features(dfx,A,B):
dfx = dfx.copy()
assert len(A) == len(B)
for a,b in zip(A,B):
dfx[a+'_'+b] = dfx[a] + '_' + dfx[b]
return dfx
combineA = ['Partner']
combineB = ['Dependents']
combineA = combineA + ['SeniorCitizen']*5
combineB = combineB + ['Dependents','Partner','Contract',
'TechSupport','PaymentMethod']
cols_cat_new = [f'{a}_{b}' for a,b in zip(combineA,combineB)]
cols_cat = list(set(cols_cat + cols_cat_new))
print(cols_cat_new)
# print(cols_cat)
df_Xtrain = combine_two_features(df_Xtrain,combineA,combineB)
df_Xtest = combine_two_features(df_Xtest,combineA,combineB)
['Partner_Dependents', 'SeniorCitizen_Dependents', 'SeniorCitizen_Partner', 'SeniorCitizen_Contract', 'SeniorCitizen_TechSupport', 'SeniorCitizen_PaymentMethod']
def create_groupby_features(dfx,cat,num,agg):
dfx = dfx.copy()
for c in cat:
for n in num:
for a in agg:
name = f"{c}_{n}_{a}"
dfx[name] = df_train.groupby(c)[n].transform(a)
return dfx
# Using more features gave me worse AUC.
# cols_grpcat = ['Contract','PaymentMethod']
# cols_grpnum = ['TotalCharges','MonthlyCharges']
# cols_grpagg = ['mean', 'max', 'min']
cols_grpcat = ['Contract']
cols_grpnum = ['TotalCharges']
cols_grpagg = ['mean']
cols_num_new = [f'{c}_{n}_{a}'
for c in cols_grpcat
for n in cols_grpnum
for a in cols_grpagg]
cols_num = list(set(cols_num + cols_num_new))
print(cols_num_new)
# print(cols_num)
df_Xtrain = create_groupby_features(df_Xtrain,cols_grpcat, cols_grpnum, cols_grpagg)
df_Xtest = create_groupby_features(df_Xtest,cols_grpcat, cols_grpnum, cols_grpagg)
['Contract_TotalCharges_mean']
df_Xtrain.head(2)
| gender | SeniorCitizen | Partner | Dependents | tenure | PhoneService | MultipleLines | InternetService | OnlineSecurity | OnlineBackup | DeviceProtection | TechSupport | StreamingTV | StreamingMovies | Contract | PaperlessBilling | PaymentMethod | MonthlyCharges | TotalCharges | Partner_Dependents | SeniorCitizen_Dependents | SeniorCitizen_Partner | SeniorCitizen_Contract | SeniorCitizen_TechSupport | SeniorCitizen_PaymentMethod | Contract_TotalCharges_mean | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Female | No | Yes | No | 36 | Yes | Yes | Fiber optic | Yes | Yes | Yes | Yes | No | Yes | Two year | Yes | Credit card (automatic) | 106.05 | 3834.40 | Yes_No | No_No | No_Yes | No_Two year | No_Yes | No_Credit card (automatic) | 3683.643192 |
| 1 | Male | No | No | No | 10 | Yes | No | DSL | Yes | No | No | Yes | Yes | No | Month-to-month | No | Bank transfer (automatic) | 62.25 | 612.95 | No_No | No_No | No_No | No_Month-to-month | No_Yes | No_Bank transfer (automatic) | 1370.923131 |
cols_drop = ['gender']
df_Xtrain = df_Xtrain.drop(cols_drop,axis=1)
df_Xtest = df_Xtest.drop(cols_drop,axis=1)
all_features = df_Xtrain.columns.tolist()
cols_cat_idx = [all_features.index(i)
for i in cols_cat]
# make sure no nans
df_Xtrain.isna().sum().sum(), df_Xtest.isna().sum().sum()
(0, 0)
df_Xtrain_full = df_Xtrain.copy()
ser_ytrain_full = ser_ytrain.copy()
ytrain_full = np.array(ser_ytrain_full).flatten()
df_Xtrain_full = pd.get_dummies(df_Xtrain_full,columns=cols_cat,drop_first=False)
df_Xtrain = pd.get_dummies(df_Xtrain,columns=cols_cat,drop_first=False)
df_Xtest = pd.get_dummies(df_Xtest,columns=cols_cat,drop_first=False)
df_Xtrain_full.head(2).append(df_Xtrain.head(2)).append(df_Xtest.head(2))
| tenure | MonthlyCharges | TotalCharges | Contract_TotalCharges_mean | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | SeniorCitizen_PaymentMethod_No_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_No_Credit card (automatic) | SeniorCitizen_PaymentMethod_No_Electronic check | SeniorCitizen_PaymentMethod_No_Mailed check | SeniorCitizen_PaymentMethod_Yes_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_Yes_Credit card (automatic) | SeniorCitizen_PaymentMethod_Yes_Electronic check | SeniorCitizen_PaymentMethod_Yes_Mailed check | PhoneService_No | PhoneService_Yes | SeniorCitizen_Contract_No_Month-to-month | SeniorCitizen_Contract_No_One year | SeniorCitizen_Contract_No_Two year | SeniorCitizen_Contract_Yes_Month-to-month | SeniorCitizen_Contract_Yes_One year | SeniorCitizen_Contract_Yes_Two year | Contract_Month-to-month | Contract_One year | Contract_Two year | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | SeniorCitizen_Partner_No_No | SeniorCitizen_Partner_No_Yes | SeniorCitizen_Partner_Yes_No | SeniorCitizen_Partner_Yes_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | Dependents_No | Dependents_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | SeniorCitizen_No | SeniorCitizen_Yes | Partner_No | Partner_Yes | InternetService_DSL | InternetService_Fiber optic | InternetService_No | Partner_Dependents_No_No | Partner_Dependents_No_Yes | Partner_Dependents_Yes_No | Partner_Dependents_Yes_Yes | SeniorCitizen_TechSupport_No_No | SeniorCitizen_TechSupport_No_No internet service | SeniorCitizen_TechSupport_No_Yes | SeniorCitizen_TechSupport_Yes_No | SeniorCitizen_TechSupport_Yes_No internet service | SeniorCitizen_TechSupport_Yes_Yes | SeniorCitizen_Dependents_No_No | SeniorCitizen_Dependents_No_Yes | SeniorCitizen_Dependents_Yes_No | SeniorCitizen_Dependents_Yes_Yes | PaperlessBilling_No | PaperlessBilling_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 36 | 106.05 | 3834.40 | 3683.643192 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 1 | 10 | 62.25 | 612.95 | 1370.923131 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 0 | 36 | 106.05 | 3834.40 | 3683.643192 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 1 | 10 | 62.25 | 612.95 | 1370.923131 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
| 0 | 1 | 48.60 | 48.60 | 3683.643192 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 |
| 1 | 56 | 99.90 | 5706.30 | 1370.923131 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 |
ser_ytrain_full.head(2).append(ser_ytrain.head(2)).append(ser_ytest.head(2))
0 0 1 0 0 0 1 0 0 1 1 0 Name: Churn, dtype: int64
from sklearn.model_selection import train_test_split
df_Xtrain, df_Xvalid, ser_ytrain, ser_yvalid = train_test_split(
df_Xtrain_full, ser_ytrain_full,
test_size=0.2,
random_state=SEED,
stratify=ser_ytrain_full)
ytrain = ser_ytrain.to_numpy().ravel()
yvalid = ser_yvalid.to_numpy().ravel()
print(f"df_train : {df_train.shape}\n")
print(f"df_Xtrain : {df_Xtrain.shape}")
print(f"ser_ytrain : {ser_ytrain.shape}\n")
print(f"df_Xvalid : {df_Xvalid.shape}")
print(f"ser_yvalid : {ser_yvalid.shape}\n")
print(f"df_test : {df_test.shape}")
print(f"ser_ytest : This does not exist.")
df_Xtrain.head(2)
df_train : (5634, 21) df_Xtrain : (4507, 77) ser_ytrain : (4507,) df_Xvalid : (1127, 77) ser_yvalid : (1127,) df_test : (1409, 21) ser_ytest : This does not exist.
| tenure | MonthlyCharges | TotalCharges | Contract_TotalCharges_mean | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | SeniorCitizen_PaymentMethod_No_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_No_Credit card (automatic) | SeniorCitizen_PaymentMethod_No_Electronic check | SeniorCitizen_PaymentMethod_No_Mailed check | SeniorCitizen_PaymentMethod_Yes_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_Yes_Credit card (automatic) | SeniorCitizen_PaymentMethod_Yes_Electronic check | SeniorCitizen_PaymentMethod_Yes_Mailed check | PhoneService_No | PhoneService_Yes | SeniorCitizen_Contract_No_Month-to-month | SeniorCitizen_Contract_No_One year | SeniorCitizen_Contract_No_Two year | SeniorCitizen_Contract_Yes_Month-to-month | SeniorCitizen_Contract_Yes_One year | SeniorCitizen_Contract_Yes_Two year | Contract_Month-to-month | Contract_One year | Contract_Two year | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | SeniorCitizen_Partner_No_No | SeniorCitizen_Partner_No_Yes | SeniorCitizen_Partner_Yes_No | SeniorCitizen_Partner_Yes_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | Dependents_No | Dependents_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | SeniorCitizen_No | SeniorCitizen_Yes | Partner_No | Partner_Yes | InternetService_DSL | InternetService_Fiber optic | InternetService_No | Partner_Dependents_No_No | Partner_Dependents_No_Yes | Partner_Dependents_Yes_No | Partner_Dependents_Yes_Yes | SeniorCitizen_TechSupport_No_No | SeniorCitizen_TechSupport_No_No internet service | SeniorCitizen_TechSupport_No_Yes | SeniorCitizen_TechSupport_Yes_No | SeniorCitizen_TechSupport_Yes_No internet service | SeniorCitizen_TechSupport_Yes_Yes | SeniorCitizen_Dependents_No_No | SeniorCitizen_Dependents_No_Yes | SeniorCitizen_Dependents_Yes_No | SeniorCitizen_Dependents_Yes_Yes | PaperlessBilling_No | PaperlessBilling_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4555 | 16 | 19.75 | 294.90 | 1370.923131 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 3379 | 72 | 64.70 | 4746.05 | 3683.643192 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
Dist = distribution eg. Bernoulli,
Score = score eg. LogScore,
Base = eg. DecisionTreeRegressor()
natural_gradient = True,
n_estimators = 500,
learning_rate = 0.01,
minibatch_frac = 1.0,
col_sample = 1.0,
verbose = True,
verbose_eval = 100,
tol = 0.0001,
random_state = None,
#=================================
NGBClassifier.fit(X,Y,
X_val = None,
Y_val = None,
sample_weight = None,
val_sample_weight = None,
train_loss_monitor = None,
val_loss_monitor = None,
early_stopping_rounds = None,
)
from ngboost import NGBClassifier
# NGBClassifier?
show_methods(ngb)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | NGBClassifier | NGBoost | helpers | ngboost |
| 1 | NGBRegressor | api | learners | scores |
| 2 | NGBSurvival | distns | manifold |
show_methods(ngb.NGBClassifier)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | feature_importances_ | get_params | predict | set_params |
| 1 | fit | line_search | predict_proba | staged_pred_dist |
| 2 | fit_base | pred_dist | sample | staged_predict |
| 3 | fit_init_params_to_marginal | pred_param | score | staged_predict_proba |
# NGBClassifier.fit?
df_Xtrain.head()
| tenure | MonthlyCharges | TotalCharges | Contract_TotalCharges_mean | OnlineBackup_No | OnlineBackup_No internet service | OnlineBackup_Yes | DeviceProtection_No | DeviceProtection_No internet service | DeviceProtection_Yes | SeniorCitizen_PaymentMethod_No_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_No_Credit card (automatic) | SeniorCitizen_PaymentMethod_No_Electronic check | SeniorCitizen_PaymentMethod_No_Mailed check | SeniorCitizen_PaymentMethod_Yes_Bank transfer (automatic) | SeniorCitizen_PaymentMethod_Yes_Credit card (automatic) | SeniorCitizen_PaymentMethod_Yes_Electronic check | SeniorCitizen_PaymentMethod_Yes_Mailed check | PhoneService_No | PhoneService_Yes | SeniorCitizen_Contract_No_Month-to-month | SeniorCitizen_Contract_No_One year | SeniorCitizen_Contract_No_Two year | SeniorCitizen_Contract_Yes_Month-to-month | SeniorCitizen_Contract_Yes_One year | SeniorCitizen_Contract_Yes_Two year | Contract_Month-to-month | Contract_One year | Contract_Two year | TechSupport_No | TechSupport_No internet service | TechSupport_Yes | SeniorCitizen_Partner_No_No | SeniorCitizen_Partner_No_Yes | SeniorCitizen_Partner_Yes_No | SeniorCitizen_Partner_Yes_Yes | StreamingTV_No | StreamingTV_No internet service | StreamingTV_Yes | Dependents_No | Dependents_Yes | PaymentMethod_Bank transfer (automatic) | PaymentMethod_Credit card (automatic) | PaymentMethod_Electronic check | PaymentMethod_Mailed check | MultipleLines_No | MultipleLines_No phone service | MultipleLines_Yes | OnlineSecurity_No | OnlineSecurity_No internet service | OnlineSecurity_Yes | SeniorCitizen_No | SeniorCitizen_Yes | Partner_No | Partner_Yes | InternetService_DSL | InternetService_Fiber optic | InternetService_No | Partner_Dependents_No_No | Partner_Dependents_No_Yes | Partner_Dependents_Yes_No | Partner_Dependents_Yes_Yes | SeniorCitizen_TechSupport_No_No | SeniorCitizen_TechSupport_No_No internet service | SeniorCitizen_TechSupport_No_Yes | SeniorCitizen_TechSupport_Yes_No | SeniorCitizen_TechSupport_Yes_No internet service | SeniorCitizen_TechSupport_Yes_Yes | SeniorCitizen_Dependents_No_No | SeniorCitizen_Dependents_No_Yes | SeniorCitizen_Dependents_Yes_No | SeniorCitizen_Dependents_Yes_Yes | PaperlessBilling_No | PaperlessBilling_Yes | StreamingMovies_No | StreamingMovies_No internet service | StreamingMovies_Yes | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 4555 | 16 | 19.75 | 294.90 | 1370.923131 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 |
| 3379 | 72 | 64.70 | 4746.05 | 3683.643192 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 1713 | 67 | 109.70 | 7344.45 | 3018.965636 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 |
| 2399 | 47 | 99.70 | 4747.20 | 1370.923131 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 1 |
| 1096 | 46 | 40.40 | 1842.70 | 3683.643192 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 1 | 0 | 0 | 1 | 1 | 0 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 0 | 0 | 1 | 0 | 1 | 0 | 0 |
model = NGBClassifier(random_state=SEED)
model.fit(df_Xtrain,ser_ytrain,
X_val=df_Xvalid,Y_val=ser_yvalid,
early_stopping_rounds=50
)
[iter 0] loss=0.5786 val_loss=0.5734 scale=2.0000 norm=4.0000 [iter 100] loss=0.4137 val_loss=0.4357 scale=2.0000 norm=3.4542 [iter 200] loss=0.3960 val_loss=0.4255 scale=1.0000 norm=1.7693 [iter 300] loss=0.3899 val_loss=0.4235 scale=1.0000 norm=1.7927 [iter 400] loss=0.3867 val_loss=0.4230 scale=1.0000 norm=1.8030
NGBClassifier(random_state=RandomState(MT19937) at 0x7FE1A24458D0)
show_methods(model)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | Base | feature_importances_ | n_estimators | scalings |
| 1 | Dist | fit | n_features | score |
| 2 | Manifold | fit_base | natural_gradient | set_params |
| 3 | Score | fit_init_params_to_marginal | pred_dist | staged_pred_dist |
| 4 | base_models | get_params | pred_param | staged_predict |
| 5 | best_val_loss_itr | init_params | predict | staged_predict_proba |
| 6 | col_idxs | learning_rate | predict_proba | tol |
| 7 | col_sample | line_search | random_state | verbose |
| 8 | evals_result | minibatch_frac | sample | verbose_eval |
pd.DataFrame(model.evals_result) # .apply(lambda x: len(x[0])) # there are only 500 values
| train | val | |
|---|---|---|
| LOGSCORE | [0.5785940551970643, 0.5729509399706245, 0.5675739959774109, 0.5623935523512812, 0.5574944416444905, 0.5527593993819673, 0.5482746472108914, 0.5439271525790048, 0.5398046260965855, 0.5357935389329... | [0.5734267276861887, 0.5685801238065917, 0.5639597137228666, 0.5595080673018743, 0.5552927070310354, 0.5513234002960982, 0.5474226684977139, 0.5437817737575842, 0.5402801080743648, 0.5368578581230... |
ypreds = model.predict(df_Xtest)
yprobs2d = model.predict_proba(df_Xtest)
model_eval_bin('ngboost',ytest,ypreds,yprobs2d,show_plots=False)
profit = get_profit(ytest,ypreds)
print(f'profit = ${profit:,d}')
precision recall f1-score support
0 0.79 0.94 0.86 1035
1 0.65 0.31 0.42 374
accuracy 0.77 1409
macro avg 0.72 0.62 0.64 1409
weighted avg 0.75 0.77 0.74 1409
[[973 62]
[259 115]]
| Accuracy | Precision | Recall | F1-score | AUC | |
|---|---|---|---|---|---|
| ngboost | 0.7722 | 0.6497 | 0.3075 | 0.4174 | 0.6238 |
profit = $-12,000
# ypreds[:5],yprobs2d[:5]
import logging
logging.basicConfig(
filename = "ngb_errors.log",
format='%(asctime)s %(levelname)-8s %(message)s',
level=logging.INFO,
datefmt='%Y-%m-%d %H:%M:%S')
logger = logging.getLogger()
import hyperopt
from hyperopt import hp, tpe, space_eval
from hyperopt.pyll.base import scope
from hyperopt.fmin import fmin
from hyperopt import STATUS_OK, Trials
show_methods(hyperopt)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | Ctrl | STATUS_OK | base | pyll_utils |
| 1 | Domain | STATUS_RUNNING | exceptions | rand |
| 2 | FMinIter | STATUS_STRINGS | fmin | space_eval |
| 3 | JOB_STATES | STATUS_SUSPENDED | fmin_pass_expr_memo_ctrl | spark |
| 4 | JOB_STATE_DONE | SparkTrials | hp | std_out_err_redirect_tqdm |
| 5 | JOB_STATE_ERROR | Trials | mix | tpe |
| 6 | JOB_STATE_NEW | absolute_import | partial | trials_from_docs |
| 7 | JOB_STATE_RUNNING | algobase | progress | utils |
| 8 | STATUS_FAIL | anneal | pyll | vectorize |
| 9 | STATUS_NEW | atpe |
show_methods(hp)
| 0 | 1 | 2 | 3 | |
|---|---|---|---|---|
| 0 | absolute_import | normal | qloguniform | randint |
| 1 | choice | pchoice | qnormal | uniform |
| 2 | lognormal | qlognormal | quniform | uniformint |
| 3 | loguniform |
b1 = DecisionTreeClassifier(criterion='friedman_mse', max_depth=2)
b2 = DecisionTreeClassifier(criterion='friedman_mse', max_depth=3)
b3 = DecisionTreeClassifier(criterion='friedman_mse', max_depth=4)
space = {
'n_estimators' :hp.randint('n_estimators',10, 1000),
'learning_rate' :hp.uniform('learning_rate', .05, 1),
'col_sample' :hp.uniform('col_sample', 0.6, 1.0),
'minibatch_frac':hp.choice('minibatch_frac', [1.0, 0.5]),
'Base' :hp.choice('Base', [b1, b2, b3])
}
params_fixed = {"verbose_eval":1,"random_state":SEED}
def objective(params):
params.update(params_fixed)
print(params)
model = NGBClassifier(**params)
model.fit(df_Xtrain,ser_ytrain,
X_val=df_Xvalid,
Y_val=ser_yvalid,
early_stopping_rounds=10)
vdpreds = model.predict(df_Xvalid)
profit = get_profit(yvalid,vdpreds)
print(profit)
logger.info(params)
results = {'loss':-profit, 'status':STATUS_OK}
return results
TRIALS = Trials()
logger.info("Start parameter optimization...")
with warnings.catch_warnings():
warnings.simplefilter("ignore")
best = fmin(fn=objective,
space=space,
algo=tpe.suggest,
max_evals=100,
trials=TRIALS)
logger.info("...done")
2020-12-27 12:03:44 INFO Start parameter optimization...
0%| | 0/100 [00:00<?, ?trial/s, best loss=?]
2020-12-27 12:03:44 INFO build_posterior_wrapper took 0.001793 seconds 2020-12-27 12:03:44 INFO TPE using 0 trials
{'col_sample': 0.9426953144148809, 'learning_rate': 0.4569258913355538, 'minibatch_frac': 0.5, 'n_estimators': 211, 'verbose_eval': 1, 'random_state': 100}
[iter 0] loss=0.5791 val_loss=0.4659 scale=2.0000 norm=4.0024
[iter 1] loss=0.4421 val_loss=0.4489 scale=2.0000 norm=3.5798
[iter 2] loss=0.4318 val_loss=0.4433 scale=1.0000 norm=1.9092
[iter 3] loss=0.4291 val_loss=0.4414 scale=1.0000 norm=1.9352
[iter 4] loss=0.3947 val_loss=0.4401 scale=1.0000 norm=1.8186
[iter 5] loss=0.4091 val_loss=0.4456 scale=1.0000 norm=1.9597
[iter 6] loss=0.4058 val_loss=0.4505 scale=2.0000 norm=3.8991
[iter 7] loss=0.4057 val_loss=0.4528 scale=0.5000 norm=21.6208
0%| | 0/100 [00:00<?, ?trial/s, best loss=?]
2020-12-27 12:03:44 ERROR job exception: Singular matrix
0%| | 0/100 [00:00<?, ?trial/s, best loss=?]
--------------------------------------------------------------------------- LinAlgError Traceback (most recent call last) <ipython-input-85-44ffdb0f1154> in <module> 8 algo=tpe.suggest, 9 max_evals=100, ---> 10 trials=TRIALS) 11 logger.info("...done") ~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/hyperopt/fmin.py in fmin(fn, space, algo, max_evals, timeout, loss_threshold, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin, points_to_evaluate, max_queue_len, show_progressbar) 480 catch_eval_exceptions=catch_eval_exceptions, 481 return_argmin=return_argmin, --> 482 show_progressbar=show_progressbar, 483 ) 484 ~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/hyperopt/base.py in fmin(self, fn, space, algo, max_evals, timeout, loss_threshold, max_queue_len, rstate, verbose, pass_expr_memo_ctrl, catch_eval_exceptions, return_argmin, show_progressbar) 684 catch_eval_exceptions=catch_eval_exceptions, 685 return_argmin=return_argmin, --> 686 show_progressbar=show_progressbar, 687 ) 688 ~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/hyperopt/fmin.py in fmin(fn, space, algo, max_evals, timeout, loss_threshold, trials, rstate, allow_trials_fmin, pass_expr_memo_ctrl, catch_eval_exceptions, verbose, return_argmin, points_to_evaluate, max_queue_len, show_progressbar) 507 508 # next line is where the fmin is actually executed --> 509 rval.exhaust() 510 511 if return_argmin: ~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/hyperopt/fmin.py in exhaust(self) 328 def exhaust(self): 329 n_done = len(self.trials) --> 330 self.run(self.max_evals - n_done, block_until_done=self.asynchronous) 331 self.trials.refresh() 332 return self ~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/hyperopt/fmin.py in run(self, N, block_until_done) 284 else: 285 # -- loop over trials and do the jobs directly --> 286 self.serial_evaluate() 287 288 self.trials.refresh() ~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/hyperopt/fmin.py in serial_evaluate(self, N) 163 ctrl = base.Ctrl(self.trials, current_trial=trial) 164 try: --> 165 result = self.domain.evaluate(spec, ctrl) 166 except Exception as e: 167 logger.error("job exception: %s" % str(e)) ~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/hyperopt/base.py in evaluate(self, config, ctrl, attach_attachments) 892 print_node_on_error=self.rec_eval_print_node_on_error, 893 ) --> 894 rval = self.fn(pyll_rval) 895 896 if isinstance(rval, (float, int, np.number)): <ipython-input-84-96472c3df56e> in objective(params) 8 X_val=df_Xvalid, 9 Y_val=ser_yvalid, ---> 10 early_stopping_rounds=10) 11 12 vdpreds = model.predict(df_Xvalid) ~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/ngboost/ngboost.py in fit(self, X, Y, X_val, Y_val, sample_weight, val_sample_weight, train_loss_monitor, val_loss_monitor, early_stopping_rounds) 253 loss_list += [train_loss_monitor(D, Y_batch, weight_batch)] 254 loss = loss_list[-1] --> 255 grads = D.grad(Y_batch, natural=self.natural_gradient) 256 257 proj_grad = self.fit_base(X_batch, grads, weight_batch) ~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/ngboost/scores.py in grad(self, Y, natural) 10 if natural: 11 metric = self.metric() ---> 12 grad = np.linalg.solve(metric, grad) 13 return grad 14 <__array_function__ internals> in solve(*args, **kwargs) ~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/numpy/linalg/linalg.py in solve(a, b) 392 signature = 'DD->D' if isComplexType(t) else 'dd->d' 393 extobj = get_linalg_error_extobj(_raise_linalgerror_singular) --> 394 r = gufunc(a, b, signature=signature, extobj=extobj) 395 396 return wrap(r.astype(result_t, copy=False)) ~/opt/miniconda3/envs/dataSc/lib/python3.7/site-packages/numpy/linalg/linalg.py in _raise_linalgerror_singular(err, flag) 86 87 def _raise_linalgerror_singular(err, flag): ---> 88 raise LinAlgError("Singular matrix") 89 90 def _raise_linalgerror_nonposdef(err, flag): LinAlgError: Singular matrix
model_eval_bin('catboost+optuna',ytest,ypreds,yprobs2d,show_plots=True)
time_taken = time.time() - time_start_notebook
h,m = divmod(time_taken,60*60)
print('Time taken to run whole notebook: {:.0f} hr '\
'{:.0f} min {:.0f} secs'.format(h, *divmod(m,60)))